set seed 122334455

version 18

cd "$dropbox/Oasis/2. Pathways to Choice/9.1 Final Analysis" // change to local directory; this should be the same directory where the survey_data.dta file is stored

cap mkdir "Output" // add output folder if it doesn't already exist in local directory

*Packages requiring user installation through --ssc install--:
	*coefplot
	*boottest
	*ritest
	*texsave

********************************************************************************

*Table of Contents:
	*1. Build globals containing controls
	*2. Figure 1
	*3. Figure 2
	*4. Figure 3
	*5. Extended Data Table 1
	*6. Extended Data Table 2
	*7. Extended Data Figure 1
	*8. Extended Data Table 3
	*9. Extended Data Table 4
	*10. Extended Data Table 5
	*11. Extended Data Table 6
	*12. Extended Data Figure 2
	*13. Extended Data Table 7
	*14. Extended Data Table 8
	*15. Other Miscellaneous Statistics
	*16. Supplementary Information Figure 1 
	*17. Supplementary Information Table 1 
	*18. Supplementary Information Table 2 
	
********************************************************************************	

use "data_for_analysis.dta", clear

********************************************************************************

*1. Build globals containing controls

*Create global containing controls for unbalanced vars at baseline, direct and interacted
global controls timespent_school_base_d timespent_school_base_miss_d self_index_z_base_d ///
	self_index_z_base_miss_d social_index_z_base_d social_index_z_base_miss_d ///
	socioecon_index_z_base_d socioecon_index_z_base_miss_d ///
	c.timespent_school_base_d#c.treated c.timespent_school_base_miss_d#c.treated c.self_index_z_base_d#c.treated ///
	c.self_index_z_base_miss_d#c.treated c.social_index_z_base_d#c.treated c.social_index_z_base_miss_d#c.treated ///
	c.socioecon_index_z_base_d#c.treated c.socioecon_index_z_base_miss_d#c.treated 

*Use loop to create global containing controls for all vars at baseline, direct and interacted
*start with raw variable list
local controls_c unmarried_base attend_school_base ever_school_base self_index_z_base social_index_z_base nr_friends_base norm_womanpos_z_base timespent_school_base terms_current_base completed_years_base ysis_currschool_base ysis_shareschool_base ybro_currschool_base ybro_shareschool_base want_highered_base agetomarry_base marriage_adv_index_z_base anywork_lastyear_base age_base socioecon_index_z_base

*use loop to build global containing variable, missing dummy, and interactions with both
global controls_c `controls_c'
foreach var of varlist `controls_c' {
	global controls_c $controls_c `var'_d `var'_miss_d c.`var'_d#c.treated c.`var'_miss_d#c.treated
}

********************************************************************************

*2. Figure 1

preserve
*Collapse data to calculate mean and standard errors for remaining unmarried rates by treatment status
collapse (mean) unmarried (semean) se_unmarried = unmarried, by(treated)

*Calculate +/- standard error
gen unmarried_upper = unmarried + se_unmarried
gen unmarried_lower = unmarried - se_unmarried

*Graph results
twoway (bar unmarried treated, barwidth(0.4) ///
    ylabel(0(0.1)1, angle(0)) ///
    legend(off) ///
    ytitle("Percent of Girls Not Married") ///
    scheme(s2mono) ///
    graphregion(color(white)) ///
    xtitle("") ///
    ylabel(0 "0%" .2 "20%" .4 "40%" .6 "60%" .8 "80%" 1 "100%", angle(0) gmin) ///
    xlabel(0 "Control" 1 "Pathways", noticks)) ///
    (rcap unmarried_lower unmarried_upper treated, lwidth(thick)) ///
	(scatter unmarried treated, msize(vsmall) mcolor(black) msymbol(circle))
graph export "Output/Fig1_unmarried_rates_endline.eps", as(png) replace
graph export "Output/Fig1_unmarried_rates_endline.png", as(png) replace
restore

********************************************************************************

*3. Figure 2

*Plot results from multiple regressions

preserve

*Create data frame inside stata
gen varname = ""
gen coef = .
gen lb = .
gen ub = .
gen pvalue = .

local x = 1
foreach var of varlist attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z {
	
	*for each outcome, regress on treatment status (renamed for convenience of exporting)
    gen treated_`var' = treated
    eststo `var' : reg `var' treated_`var' `var'_base i.pair_id `var'_base_miss $controls, cluster(community_id)
    
    *run boottest to get wild bootstrap cluster p values
    boottest treated_`var', nograph boottype(wild) cluster(community_id) reps(999) seed(122334455) level(90)

    * extract coefficient and p-value
    local coef = _b[treated_`var']
    local pval = r(pvalue)
    
    *extract confidence interval lower and upper bounds
	matrix CI = r(CI)
	local lower_bound = CI[1,1]
	local upper_bound = CI[1,2]

    *store these values in row based on location in loop
    replace varname = "`var'" in `x'
    replace coef = `coef' in `x'
    replace pvalue = `pval' in `x'
    replace lb = `lower_bound' in `x'
    replace ub = `upper_bound' in `x'
    
	*increase local to move to next row
    local x = `x' + 1
}

* Drop extra observations and save as a temporary dataset
drop if missing(coef)
tempfile plotdata
save `plotdata', replace

* Load the dataset for plotting
use `plotdata', clear

* Create a numeric variable for plotting
gen varnum = _n
label define varnum_lbl 1 "attend_school" 2 "social_index_z" 3 "self_index_z" 4 "norm_womanpos_z" 5 "marriage_adv_index_z"
label values varnum varnum_lbl

// Plot the coefficient estimates 

twoway (scatter varnum coef , msymbol(O)) ///
       (rcap lb ub varnum, horizontal), ///
       ylabel(1 `" "Attends" "School (d)" "' 2 `" "Social Support" "Index (sd)" "' 3 `" "Self-Perception" "Index (sd)" "' 4 `" "Gender Equitable" "Beliefs Index (sd)" "' 5 `" "Marriage Self-Advocacy" "Index (sd)" "', angle(0) gstyle(miniscule)) ///
	   xlabel(-0.5(0.5)3) ///
	   yscale(reverse) ///
       xline(0, lpattern(solid) lcolor(black)) ///
	   xtitle("Coefficient estimate") ///
	   ytitle("Outcome") ///
	   legend(off) ///
	   scheme(s2mono) graphregion(color(white))
	   
graph export "Output/Fig2_indices_coefplot_w_boottest.eps", as(png) replace
graph export "Output/Fig2_indices_coefplot_w_boottest.png", as(png) replace

restore

********************************************************************************

*4. Figure 3

*Plot coefficients from two regressions on treatment interacted with variables

*First regression
preserve

*Create data frame inside stata
gen varname = ""
gen coef = .
gen lb = .
gen ub = .
gen pvalue = .

local x = 1
foreach var of varlist unmarried {
	replace var = "`var'" in `x'
	eststo `var': regress `var' treated nr_sibgirls_d nr_sibgirls_treated nr_samefather_d c.treated#c.nr_samefather_d `var'_base i.pair_id `var'_base_miss $controls, cluster(community_id)
    
   *run boottest to get wild bootstrap cluster p values for treatment 
    boottest treated, nograph boottype(wild) cluster(community_id) reps(999) seed(1234) level(90)

    *extract coefficients of treated interaction, and p-value
    local coef = _b[treated]
    local pval = r(pvalue)
    
    *extract CI lower and upper bounds
	matrix CI = r(CI)
	local lower_bound = CI[1,1]
	local upper_bound = CI[1,2]

    *store these values in row based on location in loop
    replace varname = "`var'" in `x'
    replace coef = `coef' in `x'
    replace pvalue = `pval' in `x'
    replace lb = `lower_bound' in `x'
    replace ub = `upper_bound' in `x'
    local x = `x' + 1
	
	*run boottest to get wild bootstrap cluster p values for treatment interaction
    boottest c.treated#c.nr_samefather_d, nograph boottype(wild) cluster(community_id) reps(999) seed(1234) level(90)

    *extract coefficients of treated interaction, and p-value
    local coef = _b[c.treated#c.nr_samefather_d]
    local pval = r(pvalue)
    
    *extract CI lower and upper bounds
	matrix CI = r(CI)
	local lower_bound = CI[1,1]
	local upper_bound = CI[1,2]

    *store these values in row based on location in loop
    replace varname = "`var'" in `x'
    replace coef = `coef' in `x'
    replace pvalue = `pval' in `x'
    replace lb = `lower_bound' in `x'
    replace ub = `upper_bound' in `x'
	
	*increase local to move to next row
    local x = `x' + 1
}

keep varname - _est_unmarried // keep on the new datafrarme
keep in 2 // keep only the interaction coefficient

*Save as temp dataset
save "plotdata_1.dta", replace
restore

*Create data frame inside stata
preserve
gen varname = ""
gen coef = .
gen lb = .
gen ub = .
gen pvalue = .

local x = 1
foreach var of varlist unmarried {
	replace var = "`var'" in `x'
	eststo `var': regress `var' c.treated#c.avg_ageofothers_d c.avg_ageofothers_d c.treated `var'_base i.pair_id `var'_base_miss $controls age_base age_base_miss, cluster(community_id)  
    
    *run boottest to get wild bootstrap cluster p values for treatment 
    boottest treated, nograph boottype(wild) cluster(community_id) reps(999) seed(1234) level(90)

    *extract coefficients of treated interaction, and p-value
    local coef = _b[treated]
    local pval = r(pvalue)
    
    *extract CI lower and upper bounds
	matrix CI = r(CI)
	local lower_bound = CI[1,1]
	local upper_bound = CI[1,2]

    *store these values in row based on location in loop
    replace varname = "`var'" in `x'
    replace coef = `coef' in `x'
    replace pvalue = `pval' in `x'
    replace lb = `lower_bound' in `x'
    replace ub = `upper_bound' in `x'
    local x = `x' + 1
	
	*run boottest to get wild bootstrap cluster p values for treatment interaction
    boottest c.treated#c.avg_ageofothers_d, nograph boottype(wild) cluster(community_id) reps(999) seed(1234) level(90)

    *extract coefficients of treated interaction, and p-value
    local coef = _b[c.treated#c.avg_ageofothers_d]
    local pval = r(pvalue)
    
    *extract CI lower and upper bounds
	matrix CI = r(CI)
	local lower_bound = CI[1,1]
	local upper_bound = CI[1,2]

    *store these values in row based on location in loop
    replace varname = "`var'" in `x'
    replace coef = `coef' in `x'
    replace pvalue = `pval' in `x'
    replace lb = `lower_bound' in `x'
    replace ub = `upper_bound' in `x'
	
	*increase local to move to next row
    local x = `x' + 1
}

keep varname - _est_unmarried // keep only the new dataframe
keep in 2 // keep only the interaction coefficient

* Save as temp dataset
save "plotdata_2.dta", replace
restore

*Combine two temp datasets and plot the coefficient estimates 
preserve
use "plotdata_1.dta", clear
append using "plotdata_2.dta"

gen varnum = _n
label define varnum_lbl 1 "treated#nr_samefather_d" 2 "treated#avg_ageofothers_d"
label values varnum varnum_lbl

twoway (scatter varnum coef, msymbol(O)) ///
       (rcap lb ub varnum, horizontal), ///
       ylabel(0(1)3 0 " " 1 `" "Treatment x" "Nr Same Father" "' 2 `" "Treatment x" "Mean Age of Others" "' 3 " ", notick angle(0) gstyle(miniscule)) ///
	   xlabel(-0.5(0.5)1) ///
	   yscale(reverse) ///
       xline(0, lpattern(dash) lcolor(black)) ///
	   xtitle("Coefficient on interaction") ///
	   ytitle("Interacted variable") ///
	   legend(off) ///
	   scheme(s2mono) graphregion(color(white))
graph export "Output/Fig3_Heterogeneity_coefplot_interactions_w_boottest.eps", replace
graph export "Output/Fig3_Heterogeneity_coefplot_interactions_w_boottest.png", replace

*Delete the temporary data
erase "plotdata_1.dta"
erase "plotdata_2.dta"

restore

********************************************************************************

*5. Extended Data Table 1

*Regress indicator for remaining unmarried on treatment with four different control sets

*define control sets
local ctrl_list `" "unmarried_base unmarried_base_miss" "unmarried_base unmarried_base_miss i.pair_id" "unmarried_base unmarried_base_miss i.pair_id $controls" "unmarried_base unmarried_base_miss i.pair_id $controls_c" "'

eststo clear

*Create dataframe to store values for use in export
gen var=""
gen p=.
gen nrpair=.

local x = 1
foreach var of varlist unmarried unmarried unmarried unmarried {
	replace var = "`var'" in `x'
	
	*run regression with each control set
	local ctrl : word `x' of `ctrl_list'
	eststo `var'`x' : reg `var' treated `ctrl', cluster(community_id)
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	
	*define table notes depending on specification
	if `x' == 1 {
		estadd local pairid "No"
	}
	if `x' > 1 {
		estadd local pairid "Yes"
	}
	if `x' <=2 {
		estadd local indiv "No"
	}
	if `x' == 3 {
		estadd local indiv "Unbalanced"
	}
	if `x' == 4 {
		estadd local indiv "All"
	}
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store number of pairs in regression
	unique pair_id if `var'<.
	estadd scalar nrpair = `r(unique)'

	*increase local to move to next row
	local x = `x'+1
}

*run boottest to get wild bootstrap cluster p values, store in dataframe
gen boot_p=.
local x=1
foreach var of varlist unmarried unmarried unmarried unmarried {
	local ctrl : word `x' of `ctrl_list'
	reg `var' treated `ctrl', cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var of varlist unmarried unmarried unmarried unmarried {
	mkmat boot_p in `x', matrix(boot)
	mat colnames boot = treated
	estadd matrix boot : `var'`x'
	local x=`x'+1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

*store results in matrix for export
local x=1
foreach var of varlist unmarried unmarried unmarried unmarried {
	mkmat q in `x', matrix(qval)
	mat colnames qval = treated
	estadd matrix qval : `var'`x'
	local x=`x'+1
}

*estimate randomization inference p-values, store in dataframe
gen ri_p=.
local x=1
foreach var of varlist unmarried unmarried unmarried unmarried {
	local ctrl : word `x' of `ctrl_list'
	ritest treated _b[treated], reps(1000) cluster(community_id) strata(pair_id) seed(122334455): regress `var' treated `ctrl', cluster(community_id)
	replace ri_p = r(p)[1,1] in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in unmarried unmarried unmarried unmarried {
	mkmat ri_p in `x', matrix(ri)
	mat colnames ri = treated
	estadd matrix ri : `var'`x'
	local x=`x'+1
}

*export results to LaTeX
esttab using "Output/Exdat_marriage_results.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star) ri(par($\langle$ $\rangle$) pvalue(ri) star)) keep(treated) ///
	collab(none) nonotes nomtitles mgroups("Unmarried (d)", ///
	pattern(1 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Effect of Pathways on Marital Status\label{tab:final}") ///
	substitute({l}{\footnotesize {p{.75\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents treatment effect estimates for the Pathways intervention on whether a girl is unmarried at endline. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include the baseline measure of the outcome; iii) unbalanced controls includes all variables unbalanced at baseline, demeaned and included independently and interacted with treatment status; iv) all controls includes all variables listed in Extended Data Table 4 from the baseline survey, demeaned and included independently and interacted with treatment status; v) pair fixed effects indicates the inclsion of pair fixed effects in the specification; vi) robust standard errors, reported in parenthesis, are estimated clustered at the community level; vii) wild cluster bootstrap p-values (two-sided) are reported in brackets; viii) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); vix) randomization inference p-values, clustered by community and stratified by pair, are calcuated with 1,000 replications and reported in curly brackets; and x) control means are computed using endline data for the control group in the estimation sample.")

*drop temporary storage variables
drop var-ri_p

********************************************************************************

*6. Extended Data Table 2

*Estimate treatment effects on secondary outcomes

eststo clear

*Create dataframe to store values for use in export
gen var=""
gen p=.
local x = 1

foreach var of varlist attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z {
	replace var = "`var'" in `x'
	
	*run regression on each outcome
	eststo `var' : reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	
	*store table notes
	estadd local pairid "Yes"
	estadd local indiv "Yes"
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'

	*store number of pairs in regression
	unique pair_id if `var'<.
	estadd scalar nrpair = `r(unique)'
	
	*increase local to move to next row
	local x = `x'+1
}

*run boottest to get wild bootstrap cluster p values, store in dataframe
gen boot_p=.
local x=1
foreach var of varlist attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z  {
	reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z {
	mkmat boot_p in `x', matrix(boot)
	mat colnames boot = treated
	estadd matrix boot : `var'
	local x=`x'+1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

*store results in matrix for export
local x=1
foreach var in attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z {
	mkmat q in `x', matrix(qval)
	mat colnames qval = treated
	estadd matrix qval : `var'
	local x=`x'+1
}

*export results to LaTeX
esttab using "Output/Exdat_other_results.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star)) keep(treated) ///
	collab(none) mtitles("\shortstack{In School\\(d)}"  "\shortstack{Social \\ Support \\Index (sd)}"  "\shortstack{Self- \\ Perception\\ Index (sd)}" "\shortstack{Gender \\ Eq Beliefs \\ Index (sd)}" "\shortstack{Marriage \\ Self-Advocacy \\ Index (sd)}") ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Effects on Educational, Social and Attitudinal Outcomes") ///
	substitute({l}{\footnotesize {p{.87\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents treatment effect estimates for the Pathways intervention on the behavior and attitudes of adolescent girls, specifically whether the girl self-reports being enrolled in school at endline, own perceptions of social and familial support, self-perception, gender equitable beliefs, and a measurement of self-advocacy for marriage timing. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include pair fixed effects, the baseline measure of the outcome and controls for all variables unbalanced at baseline; iii) robust standard errors, reported in parenthesis, are estimated clustered at the community level; iv) wild cluster bootstrap p-values (two-sided) are reported in brackets; v) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); and vi) control means are computed using endline data for the control group in the estimation sample.")
drop var p q boot_p
eststo clear

********************************************************************************	

*7. Extended Data Table 3

*Estimate sibling effects

eststo clear

*Create dataframe to store values for use in export
gen var=""
gen p=.
local x = 1
foreach var of varlist ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool {
	replace var = "`var'" in `x'
	
	*run regression on each outcome
	eststo `var' : reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	
	*store results for export
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	
	*store table notes
	estadd local pairid "Yes"
	estadd local indiv "Yes"
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store number of pairs in regression
	unique pair_id if `var'<.
	estadd scalar nrpair = `r(unique)'

	*increase local to move to next row
	local x = `x'+1
}

*run boottest to get wild bootstrap cluster p values, store in dataframe
gen boot_p=.
local x=1
foreach var of varlist ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool {
	reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool {
	mkmat boot_p in `x', matrix(boot)
	mat colnames boot = treated
	estadd matrix boot : `var'
	local x=`x'+1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

*store results in matrix for export
local x=1
foreach var in ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool {
	mkmat q in `x', matrix(qval)
	mat colnames qval = treated
	estadd matrix qval : `var'
	local x=`x'+1
}

*export results to LaTeX
esttab using "Output/Exdat_siblingschool.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star)) keep(treated) ///
	collab(none) mtitles("Any in School" "Share in School" "Any in School" "Share in School") ///
mgroups("Younger Sisters" "Younger Brothers", ///
	pattern(1 0 1 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Effects on Younger Siblings' Schooling") ///
	substitute({l}{\footnotesize {p{.88\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents treatment effect estimates for the Pathways intervention on the educational outcomes of younger siblings, specifically whether any younger sisters are in school, the share of younger sisters in school, whether any younger brothers are in school, and the share of younger brothers in school. Note that in a handful of cases, girls did not know the precise number of younger brothers and sisters they have, but did report that one or more attend school. This data is included in the binary but not the continuous measure. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include pair fixed effects, the baseline measure of the outcome and controls for all variables unbalanced at baseline; iii) robust standard errors, reported in parenthesis, are estimated clustered at the community level; iv) wild cluster bootstrap p-values (two-sided) are reported in brackets; v) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); and vi) control means are computed using endline data for the control group in the estimation sample.")
		
drop var-q
eststo clear

********************************************************************************	

*8. Extended Data Figure 2

*create new version of indicator for being unmarried
gen unmarried_swap=unmarried

*generate counts of unmarried individuals in control and treatment
count if treated==0 & unmarried==1
local ctrl_ct = `r(N)'
count if treated==1 & unmarried==1
local treat_ct = `r(N)'

*calcaulate difference and intervals
disp `treat_ct' - `ctrl_ct'
disp (`treat_ct' - `ctrl_ct')/(`treat_ct')
disp (((`treat_ct' - `ctrl_ct')/(`treat_ct'))*100)/26

*create random sort order
gen rand=runiform()
gsort -treated -unmarried rand

eststo clear 
*start with initial regression, meaning zero swaps
eststo est0: reg unmarried treated i.pair_id, cluster(community_id) 

preserve

label var treated "0"

*swap thirteen girls at a time, looping over results
local x = 1
forvalues y=1(13)338 {
	local z = `y' + 13
	replace unmarried_swap = 0 in `y'/`z' // replace next 13 observations from 1 (unmarried) to 0 (married)

	*create new treatment dummy
	gen treated`x'=treated
	*calculate swapped percent
	local perc = round(`x'*3.15,1)
	label var treated`x' "`perc'"
	
	*run regression with swapped outcome
	eststo est`x': reg unmarried_swap treated`x' i.pair_id, cluster(community_id)
	local x = `x' + 1
}

*plot results
coefplot est*, keep(treated*) legend(off) scheme(s2mono) vertical msymbol(circle) mcolor(gs0) mlcolor(gs0) ciopts(lcolor(gs0)) graphregion(color(white)) b1title("Percent of unmarried treatment girls with swapped response") ytitle("Treatment effect") yline(0) title("Assessing Enumerator Demand") ///
	note("Note: This figure presents the results of an exercise to assess the potential contribution of" ///
	"enumerator demand to the estimation of treatment effects on remaining unmarried. First, we" ///
	"randomly sort girls within treatment and marital status. Then, we swap the response of 4% of" ///
	"unmarried girls in the treatment group. We then regress whether a girl is unmarried on a" ///
	"treatment indicator and pair fixed effects, clustering standard errors at the community level." ///
	"We then repeat the exercise (swapping responses of 4% and re-running the regression) until" ///
	"the share of girls unmarried equalizes between treatment and control. Each object in the graph" ///
	"has a circle at the point estimate from a regression of whether a girl is unmarried on treatment," ///
	"where the share of unmarried girls with swapped responses corresponds to the horizontal axis;" ///
	"error bars represent 95% confidence intervals around the estimate. The sample includes the 1,056" ///
	"girls who answered questions on marital status during the study endline.")
graph export "Output/Exdat_bound_exercise.eps", replace
graph export "Output/Exdat_bound_exercise.png", replace

restore

********************************************************************************

*9. Extended Data Table 4

*Check baseline balance

*Build dataframe
gen colname=""
gen coln=.
gen col0=.
gen col1=.
gen colp=.
gen colbootp=.

*List of baseline balance variables
local list_of_vars unmarried attend_school ever_school ysis_currschool self_index_z social_index_z nr_friends norm_womanpos_z timespent_school terms_current completed_years ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool want_highered agetomarry marriage_adv_index_z anywork_lastyear age socioecon_index_z no_endline

local num : list sizeof local(list_of_vars)

*Loop over each variable, extracting relevant data
local x=1
foreach var in `list_of_vars' {
	*store name
	local vlabel : var label `var'_base
	replace colname="`vlabel'" in `x'
	
	*store non-missing observations
	count if `var'_base_miss == 0 
	replace coln = `r(N)' in `x'
	
	*calcuate and store treatment and control mean
	forvalues y=0/1 {
		summ `var'_base if treated==`y' & pair_id<. & `var'_base_miss==0
		replace col`y'=`r(mean)' in `x'
	}
	
	*regress variable on treatment status, store resulting conventional p-value 
	reg `var'_base treated if `var'_base_miss==0, cluster(community_id)
	test treated
	replace colp=r(p) in `x'
	
	*regress variable on treatment status, store resulting wild bootstrap p-value 
	reg `var'_base treated if `var'_base_miss==0, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace colbootp = r(p) in `x'

	local x=`x'+1
}

*label columns
label var colname "Variable"
label var coln "Obs"
label var col0 "Control"
label var col1 "Treated"
label var colp "p-value"
label var colbootp "Bootstrap p-value"

*fix display to round appropriately
replace col0=round(col0,.001)
replace col1=round(col1,.001)

*round very small values to 0 to avoid displaying 'e'
replace col0=0.00 if col0<.01
replace col1=0.00 if col1<.01
tostring col0 col1, format(%5.0g) replace force
foreach var of varlist col0 col1 {
	replace `var'="" if `var'=="."
}

tostring colp colbootp, format(%9.3f) replace force

local num : list sizeof local(list_of_vars)

*export results to LaTeX
texsave colname coln col0 col1 colp colbootp using "Output/Exdat_balance.tex" in 1/`num', ///
	varlabels frag replace location(h)  width(\linewidth) title("Balance Table \label{tab:bal}") ///
	nofix hlines(0 `num') ///
	footnote("Note: This table presents baseline balance for the Pathways intervention. Each row represents a regression of the variable indicated at baseline on a treatment indicator. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) p-values are calculated based on two-sided tests with robust standard errors clustered at the community level; iii) wild cluster bootstrap p-values (two-sided) are reported in brackets; and iv) indices, where indicated, are expressed in standard deviation units, demeaned relative to the control group.")
drop colname - colbootp

********************************************************************************

*10. Extended Data Table 5

*Look at correlation between marital status and baseline covariates among the control group

eststo clear

*Run regressions

eststo: reg unmarried timespent_school_base timespent_school_base_miss if treated==0, vce(robust)
eststo: reg unmarried self_index_z_base self_index_z_base_miss if treated==0, vce(robust)
eststo: reg unmarried social_index_z_base social_index_z_base_miss if treated==0, vce(robust)
eststo: reg unmarried socioecon_index_z_base socioecon_index_z_base_miss if treated==0, vce(robust)
eststo: reg unmarried timespent_school_base timespent_school_base_miss self_index_z_base ///
	self_index_z_base_miss social_index_z_base social_index_z_base_miss ///
	socioecon_index_z_base socioecon_index_z_base_miss if treated==0, vce(robust)

*Export results to LaTeX
esttab using "Output/Exdat_imbalance.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs keep(timespent_school_base self_index_z_base social_index_z_base socioecon_index_z_base) title("Marital Status Prediction by Imbalanced Covariates\label{tab:imbalance}") ///
	collab(none) nonotes nomtitles mgroups("Unmarried (d)", ///
	pattern(1 0 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(N, fmt(0) labels("Observations")) ///
	substitute({l}{\footnotesize {p{.75\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table regressses unbalanced baseline covariates on marital status in the control group at endline. In this analysis: i) robust standard errors are reported in parenthesis.")
eststo clear

********************************************************************************

*11. Extended Data Table 6

*Estimate heterogeneous tretament effects on indicator for remaining unmarried

eststo clear

*Create dataframe to store values for use in export
gen var=""
gen p=.
gen p_int=.

*Loop over interacted variables
local x = 1
foreach var of varlist age_base_d ever_school_base_d socioecon_index_z_base_d want_highered_base_d norm_womanpos_z_base_d {
	
	*Generate interaction term
	gen t_x_`var' = treated*`var'
	local lbl : variable label `var'
	label variable t_x_`var' "... x `lbl'"
	
	*Regress on treatment, interaction term and interacted variable
	eststo `var'_a: reg unmarried treated `var' t_x_`var' unmarried_base unmarried_base_miss i.pair_id $controls

	*store p-values of treatment and interaction teerm for export
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	replace p_int = (2 * ttail(e(df_r), abs(_b[t_x_`var']/_se[t_x_`var']))) in `x'
	
	*add table notes
	estadd local pairid "Yes"
	estadd local indiv "Unbalanced"

	*store control mean
	summ unmarried if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store number of pairs in regression
	unique pair_id if unmarried<.
	estadd scalar nrpair = `r(unique)'

	*increase local to move to next row
	local x = `x' + 1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values for treatment and interaction
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

gen pvalue=p_int
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q_int

*store results in matrix for export
local x=1
foreach var of varlist age_base_d ever_school_base_d socioecon_index_z_base_d want_highered_base_d norm_womanpos_z_base_d {
	matrix qval = (q[`x'], q_int[`x'])
	mat colnames qval = treated t_x_`var'
	estadd matrix qval : `var'_a

	local x=`x'+1
}

*run boottest to get wild bootstrap cluster p values for treatment and interaction, store in dataframe
gen boot_p=.
gen boot_p_int=.
local x=1
foreach var of varlist age_base_d ever_school_base_d socioecon_index_z_base_d want_highered_base_d norm_womanpos_z_base_d {
	regress unmarried c.treated#c.`var' c.`var' c.treated unmarried_base i.pair_id unmarried_base_miss $controls , cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	boottest c.treated#c.`var', nograph boottype(wild) seed(122334455)
	replace boot_p_int = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in age_base_d ever_school_base_d socioecon_index_z_base_d want_highered_base_d norm_womanpos_z_base_d {
	mkmat boot_p boot_p_int in `x', matrix(boot)
	mat colnames boot = treated t_x_`var'
	estadd matrix boot : `var'_a
	local x=`x'+1
}

*export results to LaTeX 
esttab using "Output/Exdat_varint.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star)) keep(treated t_x_age_base_d t_x_ever_school_base_d t_x_socioecon_index_z_base_d t_x_want_highered_base_d t_x_norm_womanpos_z_base_d) ///
	collab(none) nonotes nomtitles mgroups("Unmarried (d)", ///
	pattern(1 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Effect Heterogeneity on Marital Status") ///
	substitute({l}{\footnotesize {p{\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
	addnotes("\emph{Note}: This table presents heterogeneous treatment effect estimates for the Pathways intervention on whether a girl is unmarried at endline by age at baseline, whether the girl had ever attended school at baseline, family socio-economic status at baseline, baseline schooling aspirations, and baseline gender equitable beliefs. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include the baseline measure of the outcome; iii) unbalanced controls includes all variables unbalanced at baseline, demeaned and included independently and interacted with treatment status; iv) pair fixed effects indicates the inclsion of pair fixed effects in the specification; v) robust standard errors, reported in parenthesis, are estimated clustered at the community level; vi) wild cluster bootstrap p-values (two-sided) are reported in brackets; vii) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); and viii) control means are computed using endline data for the control group in the estimation sample.")

eststo clear

*drop temporary storage variables
drop var-boot_p_int

********************************************************************************

*12. Extended Data Figure 3

*Collapse data into bar graph visualization of remaining unmarried rates by treatment status for each age in the sample at baseline

preserve
collapse (mean) unmarried (semean) se_unmarried = unmarried, by(treated age_base)

drop if age_base==0 | age_base==17 // drop missing age and drop those age 17, as there are too few observations in sample
drop if treated==.

gsort age_base -treated // sort to make arranging in graph easier

*calculate +/- se
gen unmarried_upper = unmarried + se_unmarried
gen unmarried_lower = unmarried - se_unmarried

*build dataframe to allow for space between age groups
gen bt_graph = _n 
replace bt_graph = bt_graph+.5 if bt_graph>2
replace bt_graph = bt_graph+.5 if bt_graph>4.5
replace bt_graph = bt_graph+.5 if bt_graph>7
replace bt_graph = bt_graph+.5 if bt_graph>9.5
replace bt_graph = bt_graph+.5 if bt_graph>12

*plot two bars per age, one for treatment, one for control
twoway 	(bar unmarried bt_graph if bt_graph==1, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==2, color(gs12)) ///
		(bar unmarried bt_graph if bt_graph==3.5, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==4.5, color(gs12)) ///
		(bar unmarried bt_graph if bt_graph==6, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==7, color(gs12)) ///
		(bar unmarried bt_graph if bt_graph==8.5, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==9.5, color(gs12)) ///
		(bar unmarried bt_graph if bt_graph==11, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==12, color(gs12)) ///
		(bar unmarried bt_graph if bt_graph==13.5, color(gs4)) ///
		(bar unmarried bt_graph if bt_graph==14.5, color(gs12)) ///
		(rcap unmarried_upper unmarried_lower bt_graph if bt_graph==1 | bt_graph==3.5 | bt_graph==6 | bt_graph==8.5 | bt_graph==11 | bt_graph==13.5, color(gs4)) ///
	    (rcap unmarried_upper unmarried_lower bt_graph if bt_graph==2 | bt_graph==4.5 | bt_graph==7 | bt_graph==9.5 | bt_graph==21 | bt_graph==14.5, color(gs12)) ///
		(scatter unmarried bt_graph, msymbol(circle) mcolor(black) msize(vsmall)), ///
    ylabel(0(0.1)1, angle(0)) ///
    legend(off) ///
    ytitle("Percent of Girls Not Married") ///
    scheme(s2mono) ///
    graphregion(color(white)) ///
    xtitle("Baseline Age") xlabel(1.5 "11" 4 "12" 6.5 "13" 9 "14" 11.5 "15" 14 "16", notick) ///
    ylabel(0 "0%" .2 "20%" .4 "40%" .6 "60%" .8 "80%" 1 "100%", angle(0) gmin) ///
    legend(row(1) order(1 "Pathways" 2 "Control")) title("Treatment Effects by Age") ///
	note("Note: This figure presents rates of remaining unmarried among adolescent girls measured two" ///
	"years after program start. Girls are pooled by baseline age within treatment and control" ///
	"communities. The analysis includes the 1,051 girls who answered questions on marital status" ///
	"during the study endline and reported being between ages 11 and 16 at baseline; it excludes" ///
	"five girls who reported being age 17 at baseline. Data are presented as mean +/- s.e.m.")
graph export "Output/ExDat_unmarried_byage.eps", replace
graph export "Output/ExDat_unmarried_byage.png", replace

restore

********************************************************************************

*13. Extended Data Table 7

*Implement sequential g-estimation in three ways:
	*on school attendance, direct only
	*on school attendance, direct and interacted with treatment
	*on school attendance, direct, interacted with treatment, and interacted with baseline values of other mediators

eststo clear

*direct only
capture program drop deboot_sch_raw

*define two-step procedure in program 
program deboot_sch_raw, rclass
	cap drop unmarried_dmd
	
	*regress indicator for remaining unmarried on mediator
	reg unmarried treated attend_school i.pair_id unmarried_base unmarried_base_miss $controls, cluster(community_id)
	
	*generate demediated version of outcome
	foreach x in attend_school {
		local b_`x' = _b[`x']
	}
	gen unmarried_dmd = unmarried - `b_attend_school'*attend_school
		
	*regress demediated outcome on treatment status and controls
	reg unmarried_dmd treated unmarried_base unmarried_base_miss $controls i.pair_id, cluster(community_id)
end

*bootstrap procedure to estimate standard errors
bootstrap, reps(1000): deboot_sch_raw
	eststo 
	local t = _b[treated]
	
	*store ratio between coefficient and raw estimate
	estadd scalar share= `t'/.664
	
	*store control mean
	summ unmarried if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store table notes
	estadd local pairid "Yes"
	estadd local controls "School"
	estadd local interact_t "No"
	estadd local interact_c "No"

*direct and interacted with treatment
capture program drop deboot_sch_int
*define two-step procedure in program 
program deboot_sch_int, rclass
	cap drop unmarried_dmd_int
	
	*regress indicator for remaining unmarried on mediator and mediator interacted with treatment
	reg unmarried treated attend_school treated_school i.pair_id unmarried_base unmarried_base_miss $controls, cluster(community_id)
	
	*generate demediated version of outcome
	foreach x in attend_school treated_school {
		local b_`x' = _b[`x']
	}
	gen unmarried_dmd_int = unmarried - `b_attend_school'*attend_school - `b_treated_school'*treated_school
	
	*regress demediated outcome on treatment status and controls
	reg unmarried_dmd_int treated unmarried_base unmarried_base_miss $controls i.pair_id, cluster(community_id)
end

*bootstrap procedure to estimate standard errors
bootstrap, reps(1000): deboot_sch_int
	eststo 
	local t = _b[treated]

	*store ratio between coefficient and raw estimate
	estadd scalar share= `t'/.664
	
	*store control mean
	summ unmarried if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store table notes
	estadd local pairid "Yes"
	estadd local controls "School"
	estadd local interact_t "Yes"
	estadd local interact_c "No"

*direct, interacted with treatment, and interacted with baseline values of other mediators
capture program drop deboot_all_int

*define two-step procedure in program 
program deboot_all_int, rclass
	cap drop unmarried_dmd
	
	*regress indicator for remaining unmarried on mediator, mediator interacted with treatment, mediator interacted with baseline values of other mediators
	reg unmarried treated attend_school treated_school attend_school_norm attend_school_self attend_school_social norm_womanpos_z_base norm_womanpos_z_base_miss self_index_z_base self_index_z_base_miss social_index_z_base social_index_z_base_miss i.pair_id unmarried_base unmarried_base_miss $controls, cluster(community_id)
	
	*generate demediated version of outcome
	foreach x in attend_school treated_school attend_school_norm attend_school_self attend_school_social {
		local b_`x' = _b[`x']
	}
	gen unmarried_dmd = unmarried - `b_attend_school'*attend_school - `b_treated_school'*treated_school - `b_attend_school_self'*attend_school_self - `b_attend_school_social'*attend_school_social 

	*regress demediated outcome on treatment status and controls
	reg unmarried_dmd treated unmarried_base unmarried_base_miss $controls i.pair_id, cluster(community_id)
end

*bootstrap procedure to estimate standard errors
bootstrap, reps(1000): deboot_all_int
	eststo
	local t = _b[treated]
	
	*store ratio between coefficient and raw estimate
	estadd scalar share= `t'/.664
	
	*store control mean
	summ unmarried if treated==0
	estadd scalar ctrl=`r(mean)'

	*store table notes
	estadd local pairid "Yes"
	estadd local controls "Multiple"
	estadd local interact_t "Yes"
	estadd local interact_c "Yes"

*export results to LaTeX
esttab using "Output/Exdat_seqg_mediation.tex", ///
	varwidth(32) cells(b(fmt(3)) se(par fmt(3)) p(par([ ]) fmt(3) star)) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs keep(treated) nonotes ///
	collab(none) nonotes nomtitles mgroups("Unmarried (d)", ///
	pattern(1 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(ctrl N pairid controls interact_t interact_c share, fmt(3 0 0 0 0 3) labels("Control Mean" "Observations" "Pair FE" "Mediators" "Interacted with Treatment" "Interacted with Confounders" "Share of Treatment Effect")) ///
	title("Mediated Effects of Pathways\label{tab:final_seqg}") ///
	substitute({l}{\footnotesize {p{.65\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents mediated treatment effect estimates for the Pathways intervention marriage outcomes using the sequential \$g\$-estimation method and treating school attendance as a mediator. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include pair fixed effects, the baseline measure of the outcome, and controls for all variables unbalanced at baseline, demeaned and included independently and interacted with treatment status; iii) bootstrap standard errors, estimated using 1,000 repetitions, are reported in parenthesis; iv) p-values based on two-sided tests using bootstrap-estimated standard errors are reported in brackets; v) control means are computed using endline data for the control group in the estimation sample; vi) potential confounders are social support, self-perception and gender equitable beliefs, as measured at baseline; vii) share of treatment effect compares the estimated coefficient to the average treatment effect estimated in column (3) of Extended Data Table 1.")

eststo clear

********************************************************************************

*14. Extended Data Table 8

*Estimate heterogeneneous effects by community characteristics

*create blank versions to make sure I can get the right sample for demeaning
drop avg_ageofothers_d avg_age_treated
gen avg_ageofothers_d=.
label var avg_ageofothers_d "Mean Age of Group Mates"
gen avg_age_treated=.
label var avg_age_treated "Int x Average Age of Other Group Mates"

eststo clear

*First, look at number of siblings eligible for treatment

*Create dataframe to store values for use in export
gen var=""
gen p=.
gen p_int=.

local x = 1

foreach var of varlist unmarried {
	replace var = "`var'" in `x'
	
	*regress outcome on treatment interacted with share of treated siblings

	eststo `var'_b: reg `var' treated nr_sibgirls_d nr_sibgirls_treated nr_samefather_d nr_samefather_treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	
	*store results for export
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	replace p_int = (2 * ttail(e(df_r), abs(_b[nr_samefather_treated]/_se[nr_samefather_treated]))) in `x'

	*store table notes
	estadd local pairid "Yes"
	estadd local indiv "Yes"
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'

	*store number of pairs in regression
	unique pair_id if unmarried<.
	estadd scalar nrpair = `r(unique)'
	
	*increase local to move to next row
	local x = `x' + 1
}

*run boottest to get wild bootstrap cluster p values for coefficient on treatment and interaction, store in dataframe
gen boot_p=.
gen boot_p_int=.
local x=1
foreach var of varlist unmarried {
	regress `var' treated nr_sibgirls_d nr_sibgirls_treated nr_samefather_d c.treated#c.nr_samefather_d `var'_base i.pair_id `var'_base_miss $controls, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	boottest c.treated#c.nr_samefather_d, nograph boottype(wild) seed(122334455)
	replace boot_p_int = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in unmarried {
	mkmat boot_p boot_p_int in `x', matrix(boot)
	mat colnames boot = treated nr_samefather_treated
	estadd matrix boot : `var'_b
	local x=`x'+1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values for treatment and interaction
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

gen pvalue=p_int
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q_int

*store results in matrix for export
local x=1
foreach var of varlist unmarried {
	matrix qval = (q[`x'], q_int[`x'])
	mat colnames qval = treated nr_samefather_treated
	estadd matrix qval : `var'_b
	local x=`x'+1
}

*drop internal dataframe
drop var-q_int

*Create dataframe to store values for use in export
gen var=""
gen p=.
gen p_int=.

*First, look at average age of others

local x = 1

foreach var of varlist unmarried {
	summ avg_ageofothers_base if `var'<. & treated==0
	replace avg_ageofothers_d = avg_ageofothers_base - `r(mean)' // recreating to keep same ATE
	replace avg_age_treated = avg_ageofothers_d*treated
	
	replace var = "`var'" in `x'
	
	*regress outcome on treated interacted with average age of others
	eststo `var'_a: reg `var' treated avg_ageofothers_d avg_age_treated `var'_base i.pair_id `var'_base_miss  $controls age_base age_base_miss, cluster(community_id)
	
	*store results for export
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'
	replace p_int = (2 * ttail(e(df_r), abs(_b[avg_age_treated]/_se[avg_age_treated]))) in `x'
	
	*store table notes
	estadd local pairid "Yes"
	estadd local indiv "Yes"
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store number of pairs in regression
	unique pair_id if `var'<.
	estadd scalar nrpair = `r(unique)'
	
	*increase local to move to next row
	local x = `x' + 1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

gen pvalue=p_int
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q_int

*store results in matrix for export
local x=1
foreach var of varlist unmarried {
	matrix qval = (q[`x'], q_int[`x'])
	mat colnames qval = treated avg_age_treated
	estadd matrix qval : `var'_a

	local x=`x'+1
}

*run boottest to get wild bootstrap cluster p values, store in dataframe
gen boot_p=.
gen boot_p_int=.
local x=1
foreach var of varlist unmarried {
	regress `var' c.treated#c.avg_ageofothers_d c.avg_ageofothers_d c.treated `var'_base i.pair_id `var'_base_miss $controls age_base age_base_miss, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	boottest c.treated#c.avg_ageofothers_d, nograph boottype(wild) seed(122334455)
	replace boot_p_int = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in unmarried {
	mkmat boot_p boot_p_int in `x', matrix(boot)
	mat colnames boot = treated avg_age_treated
	estadd matrix boot : `var'_a
	local x=`x'+1
}

*export results to LaTeX
esttab using "Output/Exdat_commint.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star)) keep(treated avg_age_treated nr_samefather_treated) ///
	collab(none) nomtitles mgroups("Unmarried (d)", ///
	pattern(1 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Effects by Community Heterogeneity\label{tab:final_commint}") ///
	substitute({l}{\footnotesize {p{.75\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents treatment effect estimates for the Pathways intervention on whether a girl is unmarried at endline, assessing interaction effects with the number of girls with the same father's name enrolled in the program and with the mean age of participants' group mates. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include pair fixed effects, the baseline measure of the outcome and controls for all variables unbalanced at baseline, plus either the (demeaned) age of the respondent at baseline or the (demeaned) number of girl siblings the girl has and the number of girl siblings interacted with treatment; iii) robust standard errors, reported in parenthesis, are estimated clustered at the community level; iv) wild cluster bootstrap p-values (two-sided) are reported in brackets; v) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); and vi) control means are computed using endline data for the control group in the estimation sample.")
eststo clear

drop var-boot_p_int 

********************************************************************************

*15. Other Miscellaneous Statistics

*Other baseline statistics:

summ discussmarriage_base discussmarr_recself_base

*Other endline statistics:

summ attend_school if unmarried==0 & treated==0
summ attend_school if unmarried==0 & treated==1

********************************************************************************

*16. Supplementary Information Figure 1

*Loop over each pair to collapse data into bar graph visualization of remaining unmarried rates by treatment status
forvalues x=1/9 {
	preserve
	keep if pair_id==`x'
	collapse (mean) unmarried (semean) se_unmarried = unmarried, by(treated)
	
	*Calculate +/- standard error
	gen unmarried_upper = unmarried + se_unmarried
	gen unmarried_lower = unmarried - se_unmarried

	*Graph results
	twoway (bar unmarried treated, barwidth(0.4) ///
		ylabel(0(0.1)1, angle(0)) ///
		legend(off) ///
		title("Pair `x'") ///
		scheme(s2mono) ///
		graphregion(color(white)) ///
		xtitle("") ///
		ylabel(0 "0%" .2 "20%" .4 "40%" .6 "60%" .8 "80%" 1 "100%", angle(0) gmin) ///
		xlabel(0 "Control" 1 "Pathways", noticks) ///
		ytitle("Pct Girls Not Married")) ///
		(rcap unmarried_lower unmarried_upper treated, lwidth(thick)) ///
		(scatter unmarried treated, msize(vsmall) mcolor(black) msymbol(circle))
	graph save "pair_`x'_unmarried_rates_endline.gph", replace
	restore	
}

*Combine graph for each pair into a single figure
graph combine "pair_1_unmarried_rates_endline.gph" "pair_2_unmarried_rates_endline.gph" "pair_3_unmarried_rates_endline.gph" ///
              "pair_4_unmarried_rates_endline.gph" "pair_5_unmarried_rates_endline.gph" "pair_6_unmarried_rates_endline.gph" ///
              "pair_7_unmarried_rates_endline.gph" "pair_8_unmarried_rates_endline.gph" "pair_9_unmarried_rates_endline.gph", ///
              rows(3) cols(3) ///
			  
graph export "Output/SI_Fig1_unmarried_rates_endline_by_pair.png", as(png) replace

forvalues x=1/9 {
	erase "pair_`x'_unmarried_rates_endline.gph" 
}

********************************************************************************

*17. Supplementary Information Table 1

*Estimate treatment effects on additional education outcomes

eststo clear

*Create dataframe to store values for use in export
gen var=""
gen p=.
local x = 1

foreach var of varlist timespent_school terms_current completed_years {
	replace var = "`var'" in `x'
	
	*run regression on each outcome
	eststo `var' : reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	replace p = (2 * ttail(e(df_r), abs(_b[treated]/_se[treated]))) in `x'

	*store table notes
	estadd local pairid "Yes"
	estadd local indiv "Yes"
	
	*store control mean
	summ `var' if treated==0
	estadd scalar ctrl=`r(mean)'
	
	*store number of pairs in regression
	unique pair_id if `var'<.
	estadd scalar nrpair = `r(unique)'
	
	*increase local to move to next row
	local x = `x'+1
}

*run boottest to get wild bootstrap cluster p values, store in dataframe
gen boot_p=.
local x=1
foreach var of varlist timespent_school terms_current completed_years {
	reg `var' treated `var'_base i.pair_id `var'_base_miss  $controls, cluster(community_id)
	boottest treated, nograph boottype(wild) seed(122334455)
	replace boot_p = `r(p)' in `x'
	local x = `x' + 1
}

*store results in matrix for export
local x=1
foreach var in timespent_school terms_current completed_years {
	mkmat boot_p in `x', matrix(boot)
	mat colnames boot = treated
	estadd matrix boot : `var'
	local x=`x'+1
}

*run fdr_sharpened_qvalues_auto.do to estimate q-values
gen pvalue=p
do "Input/fdr_sharpened_qvalues_auto.do"
drop pvalue-rank
rename bky06_qval q

*store results in matrix for export
local x=1
foreach var in timespent_school terms_current completed_years {
	mkmat q in `x', matrix(qval)
	mat colnames qval = treated
	estadd matrix qval : `var'
	local x=`x'+1
}

*export results to LaTeX
esttab using "Output/SI_Table1_educ_detail.tex", ///
	varwidth(32) b(3) se(3) label replace booktabs star(* 0.10 ** 0.05 *** 0.01)   ///
	compress noobs cells(b(fmt(3)) se(par fmt(3)) boot(par([ ]) pvalue(boot) star) qval(par(\{ \}) pvalue(qval) star)) keep(treated) ///
	collab(none) mtitles("\shortstack{Hours in\\School Daily}"  "\shortstack{Terms Complete \\ in Current Year}"  "\shortstack{Grade Levels \\ Completed}") ///
	stats(ctrl N nrpair pairid indiv, fmt(3 0 0 0 0) labels("Control Mean" "Observations" "Number of Pairs" "Pair FE" "Covariates")) ///
	title("Detailed Effects on Educational Outcomes") ///
	substitute({l}{\footnotesize {p{.7\linewidth}}{\footnotesize \begin{table}[htbp] \begin{table}[h!]) ///
		addnotes("\emph{Note}: This table presents treatment effect estimates for the Pathways intervention on the educational behavior of adolescent girls, specifically hours spent getting to and in school daily, the number of terms the girl has completed in the current year to date, and the total grade levels the girl has completed, either through attendance or testing. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) all specifications include pair fixed effects, the baseline measure of the outcome and controls for all variables unbalanced at baseline; iii) robust standard errors, reported in parenthesis, are estimated clustered at the community level; iv) wild cluster bootstrap p-values (two-sided) are reported in brackets; v) q-values are reported in braces based on two-sided tests adjusted to control for the false discovery rate (or the proportion of Type I errors); and vi) control means are computed using endline data for the control group in the estimation sample.")
drop var p q boot_p
eststo clear


********************************************************************************

*18. Supplementary Information Table 2

*Look for imbalance in non-response

*Create dataframe in stata
gen colname=""
gen coln=.
gen col1=.
gen colp=.

*List primary (main paper) outcomes
local outcomes_pri unmarried attend_school social_index_z self_index_z norm_womanpos_z marriage_adv_index_z 
	
*List secondary (extended data only tables) outcomes
local outcomes_sec timespent_school terms_current completed_years ysis_currschool ysis_shareschool ybro_currschool ybro_shareschool

*Generate a variable indicating if each variable is missing at endline
foreach var of varlist `outcomes_pri' `outcomes_sec' {
	gen `var'_miss = (`var'==.) if no_endline_base<.
}

*Loop over each variable, storing relevant data, in two panels

local x=1
replace colname = "\textbf{Primary outcomes}" in `x' // first panel
local x = `x'+1

foreach var of varlist `outcomes_pri' {
	
	*variable name
	local vlabel : var label `var'
	replace colname="`vlabel'" in `x'
	
	*number of observations
	count if `var' < .
	replace coln = `r(N)' in `x'
	
	*regress missing indicator on treatment and store results
	reg `var'_miss treated, cluster(community_id)
	replace col1 = _b[treated] in `x'
	test treated
	replace colp=r(p) in `x'
	
	*increase local to move to next row
	local x=`x'+1
}

replace colname = "\textbf{Secondary outcomes}" in `x' // second panel
local x = `x'+1

foreach var of varlist `outcomes_sec' {
	
	*variable name
	local vlabel : var label `var'
	replace colname="`vlabel'" in `x'
	
	*number of observations
	count if `var' < .
	replace coln = `r(N)' in `x'
	
	*regress missing indicator on treatment and store results
	reg `var'_miss treated, cluster(community_id)
	replace col1 = _b[treated] in `x'
	test treated
	replace colp=r(p) in `x'
	
	*increase local to move to next row
	local x=`x'+1
}

*label columns for export
label var colname "Variable"
label var coln "Non-Missing Obs"
label var col1 "Treated"
label var colp "p-value"

*Round data neatly
replace col1=round(col1,.001)
tostring col1, format(%5.0g) replace force
foreach var of varlist col1 {
	replace `var'="" if `var'=="."
}

*Add in significance stars
replace col1 = col1+"*" if colp<=.1 & colp>.05
replace col1 = col1+"**" if colp<=.05 & colp>.01
replace col1 = col1+"***" if colp<.01
replace col1 = "(" + col1 + ")" if colp==. & col1!=""

*Round extremely small values to zero
replace col1="0.00" if regexm(col1,"e")

tostring colp, format(%9.3f) replace force
replace colp="" if colp=="."

count if colname!=""
local num = `r(N)'

*export results to LaTeX
texsave colname coln col1 colp using "Output/SI_Table2_missings.tex" in 1/`num', ///
	varlabels frag replace location(h)  width(\linewidth) title("Missings Table \label{tab:miss}") ///
	nofix hlines(0 `num') ///
	footnote("Note: This table presents missing balance for the Pathways intervention at the endline survey, meaning each row represents a regression of whether a given outcome was missing or not on a treatment indicator. The total number of girls in the sample was 1,171, of whom 1,066 took the survey. The variables are divided into primary outcomes which are the core outcomes in the study, and secondary outcomes which appear only in the extended data tables. In this analysis: i) all coefficients are estimated using multivariate ordinary least squares regression; ii) p-values are calcuated based on two-sided tests using robust standard errors are clustered at the community level; and iii) indices, where indicated, are expressed in standard deviation units.")
drop colname - colp
drop unmarried_miss-ybro_shareschool_miss
